import re
import traceback
import uuid
from typing import Dict, List, Optional

from swarms.structs.agent import Agent
from swarms.structs.conversation import Conversation

# =============================================================================
# PROMPT FUNCTIONS FOR AGENT JUDGE
# =============================================================================


def get_reward(input: str) -> int:
    """
    Return 1 if the input contains a positive evaluation keyword, else 0.

    The check is case-insensitive and matches whole words only. The
    previous substring check produced false positives such as "incorrect"
    matching "correct"; word-level matching avoids that.

    Args:
        input (str): The text to scan for positive evaluation keywords.
            (The name shadows the builtin but is kept for backward
            compatibility with keyword callers.)

    Returns:
        int: 1 if any of "correct", "good", "excellent", or "perfect"
            appears as a whole word, 0 otherwise.

    Example:
        >>> get_reward("That is correct!")
        1
        >>> get_reward("Needs improvement.")
        0
        >>> get_reward("That answer is incorrect.")
        0
    """
    positive_keywords = {
        "correct",
        "good",
        "excellent",
        "perfect",
    }

    # Tokenize into lowercase alphabetic words so that matching respects
    # word boundaries (e.g. "incorrect" does not match "correct").
    tokens = set(re.findall(r"[a-z]+", input.lower()))

    return 1 if tokens & positive_keywords else 0


def get_agent_judge_prompt() -> str:
    """
    Return the default system prompt for the agent judge.

    The prompt instructs the judge to (1) assess the domain/context,
    (2) validate inputs, (3) perform evidence-based analysis,
    (4) compare alternative approaches, and (5) declare a final
    assessment with the ``EVALUATION_COMPLETE`` marker. It is used by
    ``AgentJudge`` whenever no custom ``system_prompt`` is supplied.

    Returns:
        str: The system prompt for the agent judge
    """
    return """# Adaptive Output Evaluator - Role and Protocol

Your role is to critically evaluate outputs across diverse domains by first understanding the context, then applying domain-appropriate evaluation criteria to provide a well-reasoned assessment.

## Core Responsibilities

1. **Context Assessment**
  - Begin by identifying the domain and specific context of the evaluation (technical, creative, analytical, etc.)
  - Determine the appropriate evaluation framework based on domain requirements
  - Adjust evaluation criteria and standards to match domain-specific best practices
  - If domain is unclear, request clarification with: DOMAIN CLARIFICATION NEEDED: *specific_question*

2. **Input Validation**
  - Ensure all necessary information is present for a comprehensive evaluation
  - Identify gaps in provided materials that would impact assessment quality
  - Request additional context when needed with: ADDITIONAL CONTEXT NEEDED: *specific_information*
  - Consider implicit domain knowledge that may influence proper evaluation

3. **Evidence-Based Analysis**
  - Apply domain-specific criteria to evaluate accuracy, effectiveness, and appropriateness
  - Distinguish between factual claims, reasoned arguments, and subjective opinions
  - Flag assumptions or claims lacking sufficient support within domain standards
  - Evaluate internal consistency and alignment with established principles in the field
  - For technical domains, verify logical and methodological soundness

4. **Comparative Assessment**
  - When multiple solutions or approaches are presented, compare relative strengths
  - Identify trade-offs between different approaches within domain constraints
  - Consider alternative interpretations or solutions not explicitly mentioned
  - Balance competing priorities based on domain-specific values and standards

5. **Final Assessment Declaration**
  - Present your final assessment with: **EVALUATION_COMPLETE \\boxed{_assessment_summary_}**
  - Follow with a concise justification referencing domain-specific standards
  - Include constructive feedback for improvement where appropriate
  - When appropriate, suggest alternative approaches that align with domain best practices"""


def get_task_evaluation_prompt(outputs: str) -> str:
    """
    Return the task instruction prompt for evaluating agent output(s).

    Interpolates ``outputs`` into a fixed template that asks the judge to
    cover strengths, weaknesses, actionable suggestions, and factual or
    logical issues.

    Args:
        outputs (str): The outputs to be evaluated (embedded verbatim at
            the end of the prompt).

    Returns:
        str: The formatted task evaluation prompt
    """
    return f"""You are an expert AI agent judge. Carefully review the following output(s) generated by another agent. Your job is to provide a detailed, constructive, and actionable critique that will help the agent improve its future performance. Your feedback should address the following points:

1. Strengths: What did the agent do well? Highlight any correct reasoning, clarity, or effective problem-solving.
2. Weaknesses: Identify any errors, omissions, unclear reasoning, or areas where the output could be improved.
3. Suggestions: Offer specific, practical recommendations for how the agent can improve its next attempt. This may include advice on reasoning, structure, completeness, or style.
4. If relevant, point out any factual inaccuracies or logical inconsistencies.

Be thorough, objective, and professional. Your goal is to help the agent learn and produce better results in the future.

Output(s) to evaluate:
{outputs}"""


# =============================================================================
# EXCEPTION CLASSES
# =============================================================================


class AgentJudgeInitializationError(Exception):
    """Raised when an AgentJudge instance cannot be initialized."""


class AgentJudgeExecutionError(Exception):
    """Raised when an AgentJudge evaluation run fails during execution."""


class AgentJudge:
    """
    A specialized agent designed to evaluate and judge outputs from other agents or systems.

    The AgentJudge acts as a quality control mechanism, providing objective assessments
    and feedback on various types of content, decisions, or outputs. It's based on research
    in LLM-based evaluation systems and can maintain context across multiple evaluations.

    This implementation supports both single task evaluation and batch processing with
    iterative refinement capabilities.

    Attributes:
        id (str): Unique identifier for the judge agent instance. A fresh
            UUID is generated per instance when not supplied.
        agent_name (str): The name of the agent judge.
        system_prompt (str): The system prompt for the agent containing evaluation instructions.
        model_name (str): The model name used for generating evaluations (e.g., "openai/o1", "gpt-4").
        conversation (Conversation): An instance of the Conversation class to manage conversation history.
        max_loops (int): The maximum number of evaluation iterations to run. Must be > 0.
        verbose (bool): Whether to enable verbose logging.
        return_score (bool): If True, ``run`` returns a 0/1 reward instead of the conversation text.
        agent (Agent): An instance of the Agent class that performs the evaluation execution.
        evaluation_criteria (Dict[str, float]): Dictionary of evaluation criteria and their weights.

    Example:
        Basic usage for evaluating agent outputs:

        ```python
        from swarms import AgentJudge

        # Initialize the judge
        judge = AgentJudge(
            agent_name="quality-judge",
            model_name="gpt-4",
            max_loops=1
        )

        # Evaluate a single output
        output = "The capital of France is Paris."
        evaluation = judge.step(task=output)

        # Evaluate multiple outputs independently
        outputs = [
            "Agent response 1: The calculation is 2+2=4",
            "Agent response 2: The weather is sunny today"
        ]
        evaluations = judge.run_batched(tasks=outputs)
        ```

    Methods:
        step(task: str = None, img: str = None) -> str:
            Processes a single task and returns the agent's evaluation.
        run(task: str = None, img: str = None) -> str | int:
            Executes evaluation in a loop with context building; returns the
            full conversation string, or a 0/1 reward when return_score=True.
        run_batched(tasks: List[str] = None) -> List[str]:
            Executes batch evaluation, running ``run`` once per task.
    """

    def __init__(
        self,
        id: Optional[str] = None,
        agent_name: str = "Agent Judge",
        description: str = "You're an expert AI agent judge. Carefully review the following output(s) generated by another agent. Your job is to provide a detailed, constructive, and actionable critique that will help the agent improve its future performance.",
        system_prompt: str = None,
        model_name: str = "openai/o1",
        max_loops: int = 1,
        verbose: bool = False,
        evaluation_criteria: Optional[Dict[str, float]] = None,
        return_score: bool = False,
        *args,
        **kwargs,
    ):
        # Generate a fresh UUID per instance. The previous default
        # (str(uuid.uuid4()) in the signature) was evaluated once at import
        # time, so every instance created without an explicit id shared the
        # same identifier.
        self.id = id if id is not None else str(uuid.uuid4())
        self.agent_name = agent_name
        self.system_prompt = system_prompt
        self.model_name = model_name
        self.conversation = Conversation(time_enabled=False)
        self.max_loops = max_loops
        self.verbose = verbose
        self.return_score = return_score
        self.evaluation_criteria = evaluation_criteria or {}

        # Inner agent always runs a single loop; the iterative refinement
        # loop is driven by AgentJudge.run via max_loops.
        self.agent = Agent(
            agent_name=agent_name,
            agent_description=description,
            system_prompt=self.enhanced_prompt(),
            model_name=model_name,
            max_loops=1,
            *args,
            **kwargs,
        )

        self.reliability_check()

    def reliability_check(self):
        """
        Validate the judge's configuration.

        Raises:
            ValueError: If max_loops is None or not a positive integer, or
                if model_name is missing.
        """
        # Reject None, zero, AND negative values (the original check let
        # negative max_loops through, which would silently skip the loop).
        if self.max_loops is None or self.max_loops <= 0:
            raise ValueError(
                f"AgentJudge: {self.agent_name} max_loops must be greater than 0"
            )

        if self.model_name is None:
            raise ValueError(
                f"AgentJudge: {self.agent_name} model_name must be provided"
            )

    def enhanced_prompt(self):
        """
        Build the system prompt, appending weighted evaluation criteria
        when any were provided.

        Returns:
            str: The (possibly criteria-augmented) system prompt.
        """
        # Fall back to the default judge prompt when no custom one is set.
        enhanced_prompt = (
            self.system_prompt or get_agent_judge_prompt()
        )
        if self.evaluation_criteria:
            criteria_str = "\n\nEvaluation Criteria:\n"
            for criterion, weight in self.evaluation_criteria.items():
                criteria_str += f"- {criterion}: weight = {weight}\n"
            enhanced_prompt += criteria_str

        return enhanced_prompt

    def step(
        self,
        task: str = None,
        img: Optional[str] = None,
    ) -> str:
        """
        Processes a single task and returns the agent's evaluation.

        This method performs a one-shot evaluation of the provided content. It takes
        a single task string (response from another LLM or agent) and generates a
        comprehensive evaluation with strengths, weaknesses, and improvement suggestions.

        Args:
            task (str, optional): The response from another LLM or agent to be evaluated.
            img (str, optional): Path to an image file for multimodal evaluation.

        Returns:
            str: A detailed evaluation response from the agent including:
                - Strengths: What the agent/output did well
                - Weaknesses: Areas that need improvement
                - Suggestions: Specific recommendations for improvement
                - Factual accuracy assessment

        Raises:
            ValueError: If no task is provided.
            AgentJudgeExecutionError: If the underlying agent call fails.

        Example:
            ```python
            # Single task evaluation
            evaluation = judge.step(task="The answer is 42.")

            # Multimodal evaluation
            evaluation = judge.step(
                task="The agent described this image as a cat",
                img="path/to/image.jpg"
            )
            ```
        """
        # Enforce the documented contract up front, outside the try block,
        # so the ValueError is not swallowed and re-wrapped below. (The
        # original docstring promised this but the code never raised it.)
        if task is None:
            raise ValueError(
                f"AgentJudge: {self.agent_name} requires a task to evaluate"
            )

        try:
            # Use the predefined task evaluation prompt
            task_instruction = get_task_evaluation_prompt(
                outputs=task
            )

            # Add evaluation criteria if provided
            if self.evaluation_criteria:
                criteria_str = "\n\nPlease use these specific evaluation criteria with their respective weights:\n"
                for (
                    criterion,
                    weight,
                ) in self.evaluation_criteria.items():
                    criteria_str += (
                        f"- {criterion}: weight = {weight}\n"
                    )
                task_instruction += criteria_str

            response = self.agent.run(
                task=task_instruction,
                img=img,
            )

            return response
        except Exception as e:
            error_message = f"AgentJudge: {self.agent_name} encountered an error: {e}\n Traceback: {traceback.format_exc()}"
            # Chain the cause so the original traceback is preserved.
            raise AgentJudgeExecutionError(error_message) from e

    def run(
        self,
        task: str = None,
        img: Optional[str] = None,
    ):
        """
        Executes evaluation in a loop with context building, collecting responses.

        This method runs the evaluation multiple times (up to max_loops) to build
        context and provide iterative feedback. Each iteration feeds the full
        conversation so far back into the next evaluation.

        Args:
            task (str, optional): The response from another LLM or agent to be evaluated.
            img (str, optional): Path to an image file for multimodal evaluation.

        Returns:
            str | int: The full conversation string after all iterations, or a
                0/1 reward (see ``get_reward``) when ``return_score`` is True.

        Raises:
            AgentJudgeExecutionError: If any evaluation iteration fails.

        Example:
            ```python
            # Evaluate a response with multiple iterations
            responses = judge.run(task="The agent said: Paris is the capital of France")

            # Multimodal evaluation with multiple iterations
            responses = judge.run(
                task="The agent described this image as a cat",
                img="path/to/image.jpg"
            )
            ```
        """
        try:
            # The agent will run in a loop, remembering and updating the conversation context at each step.
            self.conversation.add(role="user", content=task)
            for _ in range(self.max_loops):
                # Retrieve the full conversation context as a string
                context = self.conversation.get_str()
                # Build the contextualized task, always including the full conversation so far
                contextualized_task = f"{context}\n"
                # Get response for current iteration
                current_response = self.step(
                    task=contextualized_task,
                    img=img,
                )
                # Add the agent's response to the conversation history
                self.conversation.add(
                    role=self.agent.agent_name,
                    content=current_response,
                )
                # The context will be updated automatically in the next loop iteration

            # After all loops, return either the reward or the full conversation
            if self.return_score:
                return get_reward(self.conversation.get_str())
            else:
                return self.conversation.get_str()
        except Exception as e:
            error_message = f"AgentJudge: {self.agent_name} encountered an error: {e}\n Traceback: {traceback.format_exc()}"
            # Chain the cause so the original traceback is preserved.
            raise AgentJudgeExecutionError(error_message) from e

    def run_batched(
        self,
        tasks: Optional[List[str]] = None,
    ):
        """
        Runs the agent judge on a batch of tasks, one ``run`` call per task.

        Args:
            tasks (Optional[List[str]]): A list of tasks (strings) to be evaluated.

        Returns:
            List: A list where each element is the result of ``run`` for the
                corresponding task (a conversation string, or a 0/1 reward
                when ``return_score`` is True).
        """
        outputs = []
        for task in tasks:
            outputs.append(self.run(task=task))
        return outputs
